# Dependencies for the sentiment analysis of German news articles.
library(dplyr)
library(ggplot2)
library(tidytext)
library(ggthemes)
library(patchwork)
library(ggrepel)
# NOTE(review): rm(list = ls()) inside a script is an anti-pattern -- it
# clears the caller's entire workspace instead of just this script's state.
# Prefer sourcing the script in a fresh R session.
rm(list = ls())
# Load the pre-processed news articles and build the modelling data frame:
# one row per article with a document index, a pre/post-election dummy and
# the calendar week of publication.
load("../output/news_df.Rda")
model_df <- news_df %>%
  dplyr::mutate(date = as.Date(date),
                type = "news",
                source = medium) %>%
  select(date, title, title_text, text_length, text_cleaned, type, source) %>%
  dplyr::mutate(doc_index = as.numeric(rownames(.)),
                # FIX: a trailing comma was missing here, which produced the
                # "Fehler: unerwartetes Symbol" parse error. The election
                # date is written in ISO format so parsing is independent of
                # the session locale (%b month abbreviations fail outside
                # English locales).
                election_dummy = as.factor(ifelse(date <= as.Date("2017-09-24"),
                                                  "pre", "post")),
                year_week = lubridate::floor_date(date, "1 week"))
Research question:
Is there a significant difference in the way media reports before / after the elections?
# Sample coverage relative to election day (24 Sep 2017).
# FIX: this originally referenced model_df_sent, which is only created
# further below (hence "Objekt 'model_df_sent' nicht gefunden");
# model_df carries the same date column.
max(model_df$date) - as.Date("24.09.2017", "%d.%m.%Y")
min(model_df$date) - as.Date("24.09.2017", "%d.%m.%Y")
# Number of articles per outlet, shown as dodged bars for the pre- and
# post-election period.
ggplot(model_df, aes(x = source, fill = election_dummy)) +
  geom_bar(position = "dodge") +
  theme_hc() +
  #scale_fill_discrete(labels = c("Pre election\n(01.06.17 - 23.09.17)", "Post election\n(24.09.17 - 13.02.18)")) +
  theme(axis.text.x = element_text(angle = 90), legend.title = element_blank()) +
  labs(x = NULL, y = "# articles")
Method:
How to measure “the way media reports” ? –> Need to find a metric that represents the media slant
To measure the tone (or sentiment) of an article a dictionary-based method is applied. To conduct such an analysis, a list of words (dictionary) associated with a given emotion, such as negativity is pre-defined. The document is then deconstructed into individual words and each word is assigned a sentiment value according to the dictionary, where the sum of all values results in the emotional score for the given document. Such lexical or “bag-of-words” approaches are widely presented in the finance literature to determine the effect of central banks’ monetary policy communications on asset prices and real variables ( , ). use a similar approach to measure “the two Ts” (Topic and tone). They explore the effects of FOMC (Federal Open Market Committee) statements on both market and real economic variables. To calculate their score, they subtract the negative words from the positive words and divide this by the number of total words of the statement. A similar score is used by , who measure the effect of narratives and sentiment of financial market text-based data on developments in the financial system. They count the number of occurrences of excitement words and anxiety words and then scale these numbers by the total text size as measured by the number of characters.
The present paper uses a dictionary that lists words associated with positive and negative polarity weighted within the interval of \([-1; 1]\). SentimentWortschatz (SentiWS) is a publicly available German-language resource for sentiment analysis, opinion mining, etc. The current version of SentiWS (v1.8b) contains 1,650 positive and 1,818 negative words, which sum up to 15,649 positive and 15,632 negative words including their inflections, respectively. Table shows ten example entries of the dictionary. To obtain a more reliable correlation between the “target” (a political party) and the word’s polarity score, sentiment words are counted in a window of two sentences before and after the mention of a political party. The tonality score of an article is then calculated from the sum of these words divided by the total number of words in that article. Again, the tonality bias for each party is then computed as the deviation of that party’s specific tonality from the average tonality of all other parties in that outlet and standardized to range from −1 to 1.
The score is then calculated from the sum of the words in a document (which can be assigned to a word from the dictionary) divided by the total number of words in that document.
# Tokenize the cleaned article texts and attach the SentiWS dictionary
# scores; tokens without a dictionary entry receive a neutral score of 0.
tokens <- model_df %>% unnest_tokens(word, text_cleaned)
sentTxt_token <- left_join(tokens, SentiWS, by = "word") %>%
  mutate(
    polarity = coalesce(polarity, 0),
    senti_dummy = coalesce(senti_dummy, 0)
  )
How many sentiment words?
# Share of positive / negative / neutral tokens per outlet.
# FIX: the original chain of three ifelse() calls was redundant -- the third
# branch re-assigned "neutral", which the first branch had already set for
# every non-positive token. case_when() expresses the mapping once.
sentiment_word_counts <- sentTxt_token %>%
  mutate(
    sentiment = case_when(
      senti_dummy == 1 ~ "positive",
      senti_dummy == -1 ~ "negative",
      TRUE ~ "neutral"
    )
  ) %>%
  group_by(source) %>%
  # total token count per outlet, used as the denominator of freq below
  dplyr::mutate(total_words = n()) %>%
  ungroup() %>%
  group_by(sentiment, source, total_words) %>%
  dplyr::summarise(n = n()) %>%
  mutate(freq = n / total_words)
# Left panel: stacked shares of sentiment vs. non-sentiment words per outlet.
p1 <- sentiment_word_counts %>%
  ggplot(aes(x = reorder(source, n), y = freq, fill = sentiment)) +
  geom_col() +
  coord_flip() +
  hrbrthemes::theme_ipsum() +
  scale_fill_viridis_d() +
  labs(y = "%", x = NULL, title = "Sentiment vs non-sentiment words") +
  theme(legend.title = element_blank(), legend.position = "bottom")
# Right panel: positive vs negative shares only; y-axis labels are dropped
# because the left panel already shows the outlet names.
p2 <- sentiment_word_counts %>%
  filter(sentiment != "neutral") %>%
  ggplot(aes(x = reorder(source, n), y = freq, fill = sentiment)) +
  geom_col(position = "dodge") +
  coord_flip() +
  hrbrthemes::theme_ipsum() +
  scale_fill_viridis_d() +
  labs(y = "%", x = NULL) +
  theme(axis.text.y = element_blank(), legend.position = "none")
# Combine the two panels side by side (patchwork).
p1 + p2
Calculate sentiment mean by document
# Document-level sentiment: average SentiWS polarity and sentiment dummy per
# article, re-joined with the article metadata from model_df.
model_df_sent <- sentTxt_token %>%
  group_by(doc_index) %>%
  dplyr::summarise(
    total_words_document = n(),
    across(c(polarity, senti_dummy), mean)
  ) %>%
  left_join(model_df, by = "doc_index")
# Boxplot of article-level sentiment per outlet, pre vs post election,
# scaled by the outlet's article count in each period.
# FIX: the helper column was misspelled "polartiy_norm"; renamed to
# polarity_norm (it is local to this pipe, no downstream references exist).
model_df_sent %>%
  group_by(source, election_dummy) %>%
  # calculate the number of total articles
  mutate(n_articles = n()) %>%
  ungroup() %>%
  # calculate the overall sentiment / number of articles of a source before and after the elections
  mutate(senti_dummy_norm = senti_dummy / n_articles,
         polarity_norm = polarity / n_articles) %>%
  ggplot(aes(source, polarity_norm * 1000, fill = factor(election_dummy))) +
  geom_boxplot() +
  hrbrthemes::theme_ipsum() +
  # factor levels are alphabetical ("post" < "pre"), so the label order is
  # Post first, Pre second.
  scale_fill_viridis_d(labels = c("Post election", "Pre election")) +
  theme(legend.title = element_blank(),
        legend.position = "bottom") +
  labs(x = NULL, y = NULL, title = "Sentiment value of news articles",
       subtitle = "Normalized by the number of total articles of each medium")
# Weekly mean sentiment per outlet as an interactive line chart; the dashed
# vertical line marks election day (24 Sep 2017).
weekly_sent <- model_df_sent %>%
  group_by(year_week, source) %>%
  dplyr::summarise(
    article_n = n(),
    polarity = mean(polarity)
  )
p <- ggplot(weekly_sent,
            aes(year_week, polarity * 100, group = source, color = source,
                # extra "text" aesthetic feeds the plotly hover tooltip
                text = paste0("Medium: ", source,
                              "\nSentiment: ", round(polarity * 100, 2),
                              "\nWeek: ", year_week,
                              "\nTotal articles: ", article_n))) +
  geom_line() +
  geom_vline(xintercept = as.Date("24.09.2017", "%d.%m.%Y"), linetype = 2) +
  hrbrthemes::theme_ipsum() +
  scale_color_viridis_d() +
  scale_x_date(date_breaks = "4 weeks", date_labels = "%W-%Y") +
  labs(x = NULL, y = NULL, title = "Sentiment value",
       subtitle = "Grouped by week and medium") +
  theme(legend.title = element_blank(),
        legend.position = "bottom")
plotly::ggplotly(p, tooltip = "text")
Inspect articles around 2017-07-09